In [1]:
import numpy as np
import pandas as pd
  • Series
    • pandas 기본 객체로 DataFrame과 함께 빈번하게 사용
    • ndarray 기반 인덱싱 추가(1차원 배열)

In [2]:
# A scalar becomes a one-element Series with the default integer index (label 0).
series = pd.Series(20)
series


0    20
dtype: int64

In [3]:
# The default index uses integer labels, so 0 addresses the only element.
series[0]


20

In [4]:
# A range supplies the values 1..10; the labels default to 0..9.
series2 = pd.Series(range(1, 11))
series2


0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int32
  • index, value select

In [5]:
# Emit the underlying value array and the index object, one per line.
print(series2.values, series2.index, sep='\n')


[ 1  2  3  4  5  6  7  8  9 10]
RangeIndex(start=0, stop=10, step=1)
  • index 지정

In [6]:
# Build a Series with explicit string labels instead of the default integer index.
series3 = pd.Series(range(1, 5), index=list('abcd'))
# Show the Series, its raw values and its index, separated by 50-dash rules.
print(series3, '-'*50, series3.values, '-'*50, series3.index, sep='\n')


a    1
b    2
c    3
d    4
dtype: int32
--------------------------------------------------
[1 2 3 4]
--------------------------------------------------
Index(['a', 'b', 'c', 'd'], dtype='object')
  • value select
    • loc: label-based selection (uses the index labels)
    • iloc: position-based selection (0-based integer position)

In [7]:
# Display series3 again as a reference for the selections that follow.
series3


Out[7]:
a    1
b    2
c    3
d    4
dtype: int32

In [8]:
# Positional access goes through .iloc: a bare integer key on a string-labelled
# Series relies on a deprecated positional fallback (FutureWarning in pandas 2.1+).
print(series3.iloc[1])
# A label key is looked up in the index directly.
print(series3['b'])


2
2

In [9]:
# A list of labels selects several elements at once and returns a Series.
series3[['a', 'b']]


Out[9]:
a    1
b    2
dtype: int32

In [10]:
# .loc selects by index label, .iloc by 0-based position; both reach the same element here.
print(series3.loc['a'])
print(series3.iloc[0])


1
1

In [11]:
# Passing a list to .loc / .iloc returns a sub-Series instead of a scalar.
print(series3.loc[['a', 'b']])
print(series3.iloc[[0, 1]])


a    1
b    2
dtype: int32
a    1
b    2
dtype: int32
  • series create and index reuse

In [12]:
# Reuse an existing index: the scalar 1 is repeated for every label of series3.
series4 = pd.Series(1, index=series3.index)
series4


Out[12]:
a    1
b    1
c    1
d    1
dtype: int64

In [13]:
# Five draws from the standard normal distribution. No seed is set in this cell,
# so the values differ on every run.
series5 = pd.Series(np.random.randn(5))
series5


Out[13]:
0   -0.744492
1    2.135461
2   -0.447777
3    0.043428
4   -0.240325
dtype: float64

In [14]:
# Dict keys become the index labels and dict values become the data.
series6 = pd.Series({'math':100, 'sci':80})
series6


Out[14]:
math    100
sci      80
dtype: int64
  • size, shape, unique, count

In [15]:
# np.nan marks a missing value; with it present the Series is stored as float64.
s = pd.Series([2, 1, 2, 3, np.nan])
s


Out[15]:
0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64

In [16]:
# len() and .size both report the total number of elements, NaN included.
print(len(s))
print(s.size)


5
5

In [17]:
# shape is a tuple; a Series is one-dimensional, so it has a single entry.
s.shape


Out[17]:
(5,)

In [18]:
# number of non-NaN elements (the NaN entry is not counted)
s.count()


Out[18]:
4

In [19]:
# distinct values in order of first appearance; NaN is kept as a value
s.unique()


Out[19]:
array([  2.,   1.,   3.,  nan])

In [20]:
# frequency of each distinct value, most frequent first; NaN is excluded
s.value_counts()


Out[20]:
2.0    2
3.0    1
1.0    1
dtype: int64
  • head, tail, take
    • 자료 부분 출력

In [21]:
# first 5 elements
s.head()


Out[21]:
0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64

In [22]:
# last 5 elements (s has only 5, so the whole Series is shown)
s.tail()


Out[22]:
0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64

In [23]:
# n sets how many leading elements are returned.
s.head(n=2)


Out[23]:
0    2.0
1    1.0
dtype: float64

In [24]:
# select elements by 0-based integer position
s.take([0, 1])


Out[24]:
0    2.0
1    1.0
dtype: float64

In [25]:
# Intentional failure: take() works on integer positions only, so passing index
# labels raises an error (TypeError in the pandas version that produced this output).
# For label-based selection use s.loc[['a', 'b']] instead.
s = pd.Series(range(1, 3), index=['a', 'b'])
s.take(['a', 'b'])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-25-42c73aa60c4d> in <module>()
      1 s = pd.Series(range(1, 3), index=['a', 'b'])
----> 2 s.take(['a', 'b'])

D:\Anaconda3\lib\site-packages\pandas\core\series.py in take(self, indices, axis, convert, is_copy, **kwargs)
   2430         # check/convert indicies here
   2431         if convert:
-> 2432             indices = maybe_convert_indices(indices, len(self._get_axis(axis)))
   2433 
   2434         indices = _ensure_platform_int(indices)

D:\Anaconda3\lib\site-packages\pandas\core\indexing.py in maybe_convert_indices(indices, n)
   1866             return np.empty(0, dtype=np.int_)
   1867 
-> 1868     mask = indices < 0
   1869     if mask.any():
   1870         indices[mask] += n

TypeError: unorderable types: numpy.ndarray() < int()
  • arithmetic between Series (values are aligned by index label, not by position)

In [1]:
# Two Series holding the same label/value pairs, listed in opposite order.
s1 = pd.Series([1, 2, 3, 4], index=list('abcd'))
s2 = pd.Series([4, 3, 2, 1], index=list('dcba'))

# Print both Series back to back.
print(s1, s2, sep='\n')


a    1
b    2
c    3
d    4
dtype: int64
d    4
c    3
b    2
a    1
dtype: int64

In [2]:
# Addition pairs elements by index label, not by position, so the reversed
# order of s2 does not matter: each label is added to itself.
s1 + s2


Out[2]:
a    2
b    4
c    6
d    8
dtype: int64

In [3]:
# NumPy arrays carry no index labels, so addition is purely positional:
# the first element is added to the first, the second to the second, and so on.
a1 = np.array([1, 2, 3, 4])
a2 = np.array([4, 3, 2, 1])

a1 + a2


Out[3]:
array([5, 5, 5, 5])

In [4]:
# Multiplication is aligned by index label as well.
s1 * s2


Out[4]:
a     1
b     4
c     9
d    16
dtype: int64

In [5]:
# A scalar operand is applied to every element.
s1 ** 3


Out[5]:
a     1
b     8
c    27
d    64
dtype: int64

In [6]:
# 'f' exists only in s3 and 'g' only in s4. Labels without a partner in the
# other Series produce NaN, and the result is stored as float64.
s3 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'f'])
s4 = pd.Series([4, 3, 2, 1, 0], index=['d', 'c', 'b', 'a', 'g'])

s3 + s4


Out[6]:
a    2.0
b    4.0
c    6.0
d    8.0
f    NaN
g    NaN
dtype: float64
  • handling NA in pandas

In [7]:
# Same data in a NumPy array and a pandas Series so their NaN handling can be compared.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
np_array = np.array([1, 2, 3, np.nan])
pd_series = pd.Series([1, 2, 3, np.nan])

In [8]:
# NumPy's mean propagates NaN: one missing value makes the result NaN.
np_array.mean()


Out[8]:
nan

In [9]:
# pandas skips NaN by default: (1 + 2 + 3) / 3.
pd_series.mean()


Out[9]:
2.0

In [10]:
# skipna=False makes pandas propagate NaN as well.
pd_series.mean(skipna=False)


Out[10]:
nan